File set-up

Set working directory to current directory

if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolves conflicts

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package.
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package.
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package.

Set figure theme

mytheme = theme_bw(base_size = 10) + 
  theme(text = element_text(size=10, colour='black'),
        title = element_text(size=10, colour='black'),
        line = element_line(size=0.5),
        axis.title = element_text(size=10, colour='black'),
        axis.text = element_text(size=10, colour='black'),
        axis.line = element_line(colour = "black"),
        axis.ticks = element_line(size=0.5),
        strip.background = element_blank(),
        strip.text.y = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.text=element_text(size=10)) 
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
mytheme_discrete_x = mytheme + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

Read data

counts = 
  read_tsv('data/D1515T43.R1_rc.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
  mutate(sample = '250_uM_S4U_1', sample_ID = "D1515T43", conc = 250, seq_read = 'R1', replicate = 1) %>%
  bind_rows(read_tsv('data/D1515T43.R2.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '250_uM_S4U_1', sample_ID = "D1515T43", conc = 250, seq_read = 'R2', replicate = 1)) %>%
  bind_rows(read_tsv('data/D1515T44.R1_rc.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '250_uM_S4U_2', sample_ID = "D1515T44", conc = 250, seq_read = 'R1', replicate = 2)) %>%
  bind_rows(read_tsv('data/D1515T44.R2.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '250_uM_S4U_2', sample_ID = "D1515T44", conc = 250, seq_read = 'R2', replicate = 2)) %>%
  bind_rows(read_tsv('data/D1515T45.R1_rc.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '125_uM_S4U_1', sample_ID = "D1515T45", conc = 125, seq_read = 'R1', replicate = 1)) %>%
  bind_rows(read_tsv('data/D1515T45.R2.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '125_uM_S4U_1', sample_ID = "D1515T45", conc = 125, seq_read = 'R2', replicate = 1)) %>%
  bind_rows(read_tsv('data/D1515T46.R1_rc.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '125_uM_S4U_2', sample_ID = "D1515T46", conc = 125, seq_read = 'R1', replicate = 2)) %>%
  bind_rows(read_tsv('data/D1515T46.R2.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '125_uM_S4U_2', sample_ID = "D1515T46", conc = 125, seq_read = 'R2', replicate = 2)) %>%
  bind_rows(read_tsv('data/D1515T47.R1_rc.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '0_uM_S4U_2', sample_ID = "D1515T47", conc = 0, seq_read = 'R1', replicate = 2)) %>%
  bind_rows(read_tsv('data/D1515T47.R2.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '0_uM_S4U_2', sample_ID = "D1515T47", conc = 0, seq_read = 'R2', replicate = 2)) %>%
  bind_rows(read_tsv('data/D1515T48.R1_rc.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '0_uM_S4U_3', sample_ID = "D1515T48", conc = 0, seq_read = 'R1', replicate = 3)) %>%
  bind_rows(read_tsv('data/D1515T48.R2.fastq_slamdunk_mapped_filtered_tcount.tsv', skip = 2) %>%
              mutate(sample = '0_uM_S4U_3', sample_ID = "D1515T48", conc = 0, seq_read = 'R2', replicate = 3))
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 276905 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): Chromosome, Name, Strand
## dbl (13): Start, End, Length, ConversionRate, ReadsCPM, Tcontent, CoverageOn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
counts

remove all transcripts that were not found

counts = counts %>% filter(ReadCount > 0)

counts

add annotation

anno = read_tsv('../../../scripts/gencode.v44.chr_patch_hapl_scaff.annotation_transcript_type.bed')
## Rows: 276905 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): chr, transcript_id, strand, transcript_type
## dbl (3): start, end, score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
counts = counts %>% 
  rename('chr' = Chromosome,
         'start' = Start,
         'end' = End,
         'transcript_id' = 'Name',
         'strand' = Strand) %>%
  left_join(anno) 
## Joining with `by = join_by(chr, start, end, transcript_id, strand)`

simplify gene types

counts %>% count(transcript_type) %>% arrange(desc(n))
counts = counts %>%
  mutate(transcript_type_cat = ifelse(transcript_type == 'protein_coding', 'protein_coding', 'other'),
         transcript_type_cat = ifelse(transcript_type == 'lncRNA', 'lncRNA', transcript_type_cat))
         
counts$transcript_type_cat = factor(counts$transcript_type_cat, levels = c('protein_coding', 'lncRNA', 'other'))

counts
counts %>% count(transcript_type_cat) %>% arrange(desc(n))

Data-analysis

How many of the transcripts are labeled?

how many transcript have at least one T > C conversion

counts %>% 
  group_by(sample) %>%
  summarise(perc_conversed = 100 * sum(ConversionRate > 0) / n())
100*nrow(counts %>% filter(ConversionRate > 0)) / nrow(counts)
## [1] 88.46036

bar plot

counts %>% 
  group_by(sample, conc) %>%
  summarise(perc_conversed = sum(ConversionRate > 0.00) / n()) %>%
  mutate(conc = paste(conc, 'µM', sep = ' ')) %>% 
  ggplot(aes(sample, perc_conversed, fill = conc)) +
  geom_bar(stat = 'identity') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  scale_fill_manual(values = c('lightcoral', 'deepskyblue', 'forestgreen'))
## `summarise()` has grouped output by 'sample'. You can override using the
## `.groups` argument.

bar plot per read

perc_conversed = counts %>%
  group_by(sample, seq_read, conc) %>%
  summarise(nr_transcripts = n(),
            nr_conversed = sum(ConversionsOnTs > 0),
            perc_conversed = nr_conversed/nr_transcripts)
## `summarise()` has grouped output by 'sample', 'seq_read'. You can override
## using the `.groups` argument.
perc_conversed
perc_conversed %>%
  ggplot(aes(sample, perc_conversed, fill = seq_read)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  facet_wrap(~conc, scales = 'free_x')

with filter on minimal conversion rate

perc_conversed = counts %>%
  group_by(sample, seq_read, conc) %>%
  summarise(nr_transcripts = n(),
            nr_conversed = sum(ConversionsOnTs > 0),
            nr_conversed = sum(ConversionRate > 0.01),
            perc_conversed = nr_conversed/nr_transcripts) 
## `summarise()` has grouped output by 'sample', 'seq_read'. You can override
## using the `.groups` argument.
perc_conversed
perc_conversed %>%
  ggplot(aes(sample, perc_conversed, fill = seq_read)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  facet_wrap(~conc, scales = 'free_x')

how many labeled per gene category

perc_conversed = counts %>%
  #filter(ReadCount > 99) %>%
  group_by(sample, seq_read, conc, transcript_type_cat) %>%
  summarise(nr_transcripts = n(),
            nr_conversed = sum(ConversionsOnTs > 0),
            perc_conversed = nr_conversed/nr_transcripts) 
## `summarise()` has grouped output by 'sample', 'seq_read', 'conc'. You can
## override using the `.groups` argument.
perc_conversed
perc_conversed %>%
  ggplot(aes(sample, perc_conversed, fill = transcript_type_cat)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  facet_wrap(~conc, scales = 'free_x')

with filter on minimal conversion rate

perc_conversed = counts %>%
  #filter(ReadCount > 99) %>%
  group_by(sample, seq_read, conc, transcript_type_cat) %>%
  summarise(nr_transcripts = n(),
            nr_conversed = sum(ConversionsOnTs > 0),
            nr_conversed = sum(ConversionRate > 0.01),
            perc_conversed = nr_conversed/nr_transcripts) 
## `summarise()` has grouped output by 'sample', 'seq_read', 'conc'. You can
## override using the `.groups` argument.
perc_conversed
perc_conversed %>%
  ggplot(aes(sample, perc_conversed, fill = transcript_type_cat)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  facet_wrap(~conc, scales = 'free_x')

Conversion rate

median conversion rate

counts %>% 
  group_by(sample, seq_read) %>%
  summarise(median_conversionrate = median(ConversionRate))
## `summarise()` has grouped output by 'sample'. You can override using the
## `.groups` argument.

bar plot

counts %>% 
  mutate(conc = as.character(conc)) %>%
  group_by(sample, conc) %>%
  summarise(median_conversionrate = median(ConversionRate)) %>%
  ggplot(aes(sample, median_conversionrate, fill = conc)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  scale_fill_manual(values = c('lightcoral', 'forestgreen', 'deepskyblue')) +
  facet_wrap(~conc, scales = 'free_x') +
  geom_text(aes(label = sprintf("%0.2f", round(100 * median_conversionrate, digits = 4))), vjust = -0.2)
## `summarise()` has grouped output by 'sample'. You can override using the
## `.groups` argument.

bar plot with separate seq reads

counts %>% 
  mutate(conc = as.character(conc),
         conc_2 = paste(conc, '_', seq_read, sep = '')) %>%
  group_by(sample, seq_read, conc_2, conc) %>%
  summarise(median_conversionrate = median(ConversionRate)) %>%
  ggplot(aes(sample, median_conversionrate, fill = conc_2)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent_format()) +
  mytheme_discrete_x +
  scale_fill_manual(values = c('mistyrose1', 'lightcoral', 'lightblue', 'deepskyblue','lightgreen', 'forestgreen')) +
  #geom_text(aes(label = sprintf("%0.2f", round(100 * median_conversionrate, digits = 4))), vjust = -0.2) + 
  facet_wrap(~conc, scales = 'free_x')
## `summarise()` has grouped output by 'sample', 'seq_read', 'conc_2'. You can
## override using the `.groups` argument.

boxplot

counts %>%
  mutate(conc = as.character(conc)) %>%
  ggplot(aes(sample, ConversionRate, fill = conc)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c('lightcoral', 'forestgreen', 'deepskyblue')) +
  mytheme_discrete_x

boxplot with separate seq reads

counts %>%
  mutate(conc = as.character(conc),
         conc_2 = paste(conc, '_', seq_read, sep = '')) %>%
  ggplot(aes(sample, ConversionRate, fill = conc_2)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c('mistyrose1', 'lightcoral', 'lightblue', 'deepskyblue','lightgreen', 'forestgreen')) +
  mytheme_discrete_x

density plot

counts %>%
  mutate(conc = as.character(conc)) %>%
  ggplot((aes(ConversionRate, group = sample, color = conc))) +
  geom_density() +
  scale_x_continuous(labels = scales::percent_format())

correlation read count (CPM) and conversion rate

counts %>%
  mutate(conc = as.character(conc)) %>%
  ggplot(aes(ReadsCPM, ConversionRate, color = conc)) +
  geom_point(alpha = 0.2) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_x_log10() +
  facet_wrap(~sample)

counts %>%
  group_by(sample, seq_read) %>%
  summarise(mean_count = mean(ReadCount))
## `summarise()` has grouped output by 'sample'. You can override using the
## `.groups` argument.
counts %>% select(transcript_id) %>% unique()